In [1]:
import warnings
warnings.filterwarnings('ignore')
In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
sns.set_style(style='darkgrid')
In [3]:
# Suppress scientific notation and display numbers with 2 decimal places instead
pd.options.display.float_format = '{:,.2f}'.format        
In [4]:
df = pd.read_csv('concrete.csv')
df.head(10)
Out[4]:
cement slag ash water superplastic coarseagg fineagg age strength
0 141.30 212.00 0.00 203.50 0.00 971.80 748.50 28 29.89
1 168.90 42.20 124.30 158.30 10.80 1,080.80 796.20 14 23.51
2 250.00 0.00 95.70 187.40 5.50 956.90 861.20 28 29.22
3 266.00 114.00 0.00 228.00 0.00 932.00 670.00 28 45.85
4 154.80 183.40 0.00 193.30 9.10 1,047.40 696.70 28 18.29
5 255.00 0.00 0.00 192.00 0.00 889.80 945.00 90 21.86
6 166.80 250.20 0.00 203.50 0.00 975.60 692.60 7 15.75
7 251.40 0.00 118.30 188.50 6.40 1,028.40 757.70 56 36.64
8 296.00 0.00 0.00 192.00 0.00 1,085.00 765.00 28 21.65
9 155.00 184.00 143.00 194.00 9.00 880.00 699.00 28 28.99
In [ ]:
#Exploratory data quality report
In [ ]:
#1. Univariate analysis
In [5]:
# Data types and non-null counts
df.info()

# 9 columns and 1030 rows
# No missing values: every column has 1030 non-null entries
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cement        1030 non-null   float64
 1   slag          1030 non-null   float64
 2   ash           1030 non-null   float64
 3   water         1030 non-null   float64
 4   superplastic  1030 non-null   float64
 5   coarseagg     1030 non-null   float64
 6   fineagg       1030 non-null   float64
 7   age           1030 non-null   int64  
 8   strength      1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB
In [6]:
#statistical summary - range of values observed, central values (mean and median), standard deviation and quartiles - Q1 (25%), Q2 (50%), Q3 (75%)
df.describe().transpose()
Out[6]:
count mean std min 25% 50% 75% max
cement 1,030.00 281.17 104.51 102.00 192.38 272.90 350.00 540.00
slag 1,030.00 73.90 86.28 0.00 0.00 22.00 142.95 359.40
ash 1,030.00 54.19 64.00 0.00 0.00 0.00 118.30 200.10
water 1,030.00 181.57 21.35 121.80 164.90 185.00 192.00 247.00
superplastic 1,030.00 6.20 5.97 0.00 0.00 6.40 10.20 32.20
coarseagg 1,030.00 972.92 77.75 801.00 932.00 968.00 1,029.40 1,145.00
fineagg 1,030.00 773.58 80.18 594.00 730.95 779.50 824.00 992.60
age 1,030.00 45.66 63.17 1.00 7.00 28.00 56.00 365.00
strength 1,030.00 35.82 16.71 2.33 23.71 34.45 46.14 82.60
In [ ]:
## Mean < Median (Q2, 50%) => negative skew
# water
# superplastic
# fineagg

## Mean > Median (Q2, 50%) => positive skew
# cement
# slag
# ash
# age
# strength
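In [ ]:
# Hedged sanity check (sketch, not part of the original run): pandas' skew()
# computes sample skewness directly, so the mean-vs-median reading above can be
# verified. Positive values => right (positive) skew, negative => left (negative) skew.
df.skew().sort_values()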
In [7]:
#Number of Unique values
df.nunique()
Out[7]:
cement          278
slag            185
ash             156
water           195
superplastic    111
coarseagg       284
fineagg         302
age              14
strength        845
dtype: int64
In [8]:
# Missing-values check: selecting non-null entries returns the full frame, so there are no NaNs (a more direct check is sketched after the output below)
df[~df.isna()]
Out[8]:
cement slag ash water superplastic coarseagg fineagg age strength
0 141.30 212.00 0.00 203.50 0.00 971.80 748.50 28 29.89
1 168.90 42.20 124.30 158.30 10.80 1,080.80 796.20 14 23.51
2 250.00 0.00 95.70 187.40 5.50 956.90 861.20 28 29.22
3 266.00 114.00 0.00 228.00 0.00 932.00 670.00 28 45.85
4 154.80 183.40 0.00 193.30 9.10 1,047.40 696.70 28 18.29
... ... ... ... ... ... ... ... ... ...
1025 135.00 0.00 166.00 180.00 10.00 961.00 805.00 28 13.29
1026 531.30 0.00 0.00 141.80 28.20 852.10 893.70 3 41.30
1027 276.40 116.00 90.30 179.60 8.90 870.10 768.30 28 44.28
1028 342.00 38.00 0.00 228.00 0.00 932.00 670.00 270 55.06
1029 540.00 0.00 0.00 173.00 0.00 1,125.00 613.00 7 52.61

1030 rows × 9 columns
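In [ ]:
# More direct missing-value check (sketch, not part of the original run):
# per-column NaN counts; all zeros is expected given df.info() above.
df.isna().sum()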

In [9]:
# Check for incorrect entries: rows containing any non-numeric value (an empty result means none)
df[~df.applymap(np.isreal).all(1)]
Out[9]:
cement slag ash water superplastic coarseagg fineagg age strength
In [ ]:
#analysis of the body of distributions / tails; Check for outliers
In [10]:
df.hist(stacked=False, bins=100, figsize=(12,30), layout=(14,2)); 
In [11]:
plt.figure(figsize=(15,10))
pos = 1
for i in df.drop(columns = 'strength').columns:
    plt.subplot(3, 3, pos)
    sns.boxplot(df[i])
    pos += 1 
In [ ]:
# age has outliers
# water and superplastic also have outliers
# slag and fineagg also have a few outliers
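In [ ]:
# Rough IQR-based outlier count per column (illustrative sketch using the
# conventional 1.5*IQR fences; not part of the original run).
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).sum()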
In [12]:
import pandas_profiling 
# Generate the pandas profiling report and check for incorrect entries
pandas_profiling.ProfileReport(df)



Out[12]:
(interactive pandas profiling report not preserved in this text export)
In [ ]:
#There are 25 duplicate rows
#slag has 471 (45.7%) zeros
#ash has 566 (55.0%) zeros
#superplastic has 379 (36.8%) zeros

#845 distinct values of Strength
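In [ ]:
# Quick check of the zero counts quoted above (sketch, not part of the original
# run); duplicates have not been dropped yet, so the counts should match the profiling report.
(df == 0).sum()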
In [13]:
#Drop duplicates
df = df.drop_duplicates()
In [ ]:
#2. Bi-variate analysis
In [14]:
# Correlation among the predictor variables and between the predictors and the target column
df.corr()
Out[14]:
cement slag ash water superplastic coarseagg fineagg age strength
cement 1.00 -0.30 -0.39 -0.06 0.06 -0.09 -0.25 0.09 0.49
slag -0.30 1.00 -0.31 0.13 0.02 -0.28 -0.29 -0.04 0.10
ash -0.39 -0.31 1.00 -0.28 0.41 -0.03 0.09 -0.16 -0.08
water -0.06 0.13 -0.28 1.00 -0.65 -0.21 -0.44 0.28 -0.27
superplastic 0.06 0.02 0.41 -0.65 1.00 -0.24 0.21 -0.19 0.34
coarseagg -0.09 -0.28 -0.03 -0.21 -0.24 1.00 -0.16 -0.01 -0.14
fineagg -0.25 -0.29 0.09 -0.44 0.21 -0.16 1.00 -0.16 -0.19
age 0.09 -0.04 -0.16 0.28 -0.19 -0.01 -0.16 1.00 0.34
strength 0.49 0.10 -0.08 -0.27 0.34 -0.14 -0.19 0.34 1.00
In [15]:
plt.figure(figsize=(10,8))

sns.heatmap(df.corr(),
            annot=True,
            linewidths=.5,
            center=0,
            cbar=False,
            cmap="YlGnBu")

plt.show()
In [ ]:
# superplastic is positively correlated with ash
# superplastic is negatively correlated with water
# fineagg is moderately negatively correlated with water


# strength is positively correlated with cement
# strength is moderately positively correlated with superplastic and age
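In [ ]:
# Verification sketch (not part of the original run): correlation of each
# predictor with strength, sorted, to back up the observations above.
df.corr()['strength'].sort_values(ascending=False)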
In [16]:
sns.pairplot(df,diag_kind='kde');
In [ ]:
#3. Feature Engineering techniques
In [ ]:
# a. Identify opportunities (if any) to extract a new feature from existing features

# Polynomial features are added below.

# ash is the feature that could be removed, as it has very low correlation with strength.
# No feature is removed here, since the desired R2 score is achieved without dropping any.
# An illustrative derived feature is sketched in the next cell.
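In [ ]:
# Illustrative sketch only (not used downstream): a domain-motivated derived
# feature such as the water-to-cement ratio could be extracted from the existing
# columns. The modelling below relies on PolynomialFeatures instead.
df_fe = df.copy()
df_fe['water_cement_ratio'] = df_fe['water'] / df_fe['cement']
df_fe[['water', 'cement', 'water_cement_ratio']].corrwith(df_fe['strength'])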
In [ ]:
# b. Get the data model-ready and do a train-test split
In [17]:
X = df.drop('strength' , axis=1)
y = df['strength']
In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=7)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[18]:
((703, 8), (302, 8), (703,), (302,))
In [19]:
X_train.head()
Out[19]:
cement slag ash water superplastic coarseagg fineagg age
471 148.00 175.00 0.00 171.00 2.00 1,000.00 828.00 28
843 300.00 0.00 0.00 184.00 0.00 1,075.00 795.00 7
188 252.30 0.00 98.80 146.30 14.20 987.80 889.00 100
251 290.40 0.00 96.20 168.10 9.40 961.20 865.00 28
71 342.00 38.00 0.00 228.00 0.00 932.00 670.00 90
In [ ]:
# c. Check for higher-degree relationships: should the model be linear, quadratic or higher degree? Use PolynomialFeatures (consider degrees 2 and 3)
In [20]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
In [21]:
# Degree 1 polynomial features (interaction terms only)
poly1 = PolynomialFeatures(degree=1, interaction_only=True)
X_train1 = poly1.fit_transform(X_train)
X_test1 = poly1.transform(X_test)   # transform only; the fit comes from the training set

poly1_clf = linear_model.LinearRegression()

poly1_clf.fit(X_train1, y_train)

y1_pred = poly1_clf.predict(X_test1)

# In-sample (training) R^2 always improves as variables are added
print("Training score of degree 1 Polynomial Features", poly1_clf.score(X_train1, y_train))

# Out-of-sample (testing) R^2 is our measure of success
print("Testing score of degree 1 Polynomial Features", poly1_clf.score(X_test1, y_test))
Training score of degree 1 Polynomial Features 0.5848808498334843
Testing score of degree 1 Polynomial Features 0.6433923539887811
In [22]:
# Degree 2 polynomial features (interaction terms only)
poly2 = PolynomialFeatures(degree=2, interaction_only=True)
X_train2 = poly2.fit_transform(X_train)
X_test2 = poly2.transform(X_test)

poly2_clf = linear_model.LinearRegression()

poly2_clf.fit(X_train2, y_train)

y2_pred = poly2_clf.predict(X_test2)

#print(y2_pred)

# In-sample (training) R^2 always improves as variables are added
print("Training score of degree 2 Polynomial Features", poly2_clf.score(X_train2, y_train))

# Out-of-sample (testing) R^2 is our measure of success
print("Testing score of degree 2 Polynomial Features", poly2_clf.score(X_test2, y_test))
Training score of degree 2 Polynomial Features 0.7552496688240806
Testing score of degree 2 Polynomial Features 0.6911067144270631
In [ ]:
# Degree 2 is much better than degree 1. Let's check degree 3.
In [23]:
# Degree 3 polynomial features (interaction terms only)
poly3 = PolynomialFeatures(degree=3, interaction_only=True)
X_train3 = poly3.fit_transform(X_train)
X_test3 = poly3.transform(X_test)

poly_clf3 = linear_model.LinearRegression()

poly_clf3.fit(X_train3, y_train)

y_pred3 = poly_clf3.predict(X_test3)

# In-sample (training) R^2 always improves as variables are added
print("Training score of degree 3 Polynomial Features", poly_clf3.score(X_train3, y_train))

# Out-of-sample (testing) R^2 is our measure of success
print("Testing score of degree 3 Polynomial Features", poly_clf3.score(X_test3, y_test))
Training score of degree 3 Polynomial Features 0.7363793826425911
Testing score of degree 3 Polynomial Features 0.6697958897062548
In [ ]:
# Degree 3 overfits: the testing score of degree 2 is better than that of degree 3.
# So the best polynomial feature degree is 2.
In [24]:
#Shapes
print("Shape of Original Training Set", X_train.shape)
print("Shape of Degree 1 Training Set", X_train1.shape)
print("Shape of Degree 2 Training Set", X_train2.shape)
print("Shape of Degree 3 Training Set", X_train3.shape)
Shape of Original Training Set (703, 8)
Shape of Degree 1 Training Set (703, 9)
Shape of Degree 2 Training Set (703, 37)
Shape of Degree 3 Training Set (703, 93)
In [ ]:
#Creating the model and tuning it
In [25]:
model= []
trainscore = []
testscore = []
rmse = []

# Blank lists to store model name, training score, testing score and RMSE (root mean squared error)
In [26]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
In [27]:
# Use k-fold cross-validation to evaluate model performance
num_folds = 10
seed = 7

# shuffle=True is needed for random_state to take effect; cv=num_folds below uses the default unshuffled splitter
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

# Note: for a regressor, cross_val_score returns R^2 values
scores = cross_val_score(poly2_clf, X_train2, y_train, cv=num_folds)
print("Accuracy: %.3f%% (%.3f%%)" % (scores.mean()*100.0, scores.std()*100.0))
Accuracy: 71.042% (4.835%)
In [28]:
#Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor
dTree = DecisionTreeRegressor()

dTree.fit(X_train2, y_train)

print(dTree.score(X_test2, y_test))

model.append('Decision Tree')
trainscore.append(dTree.score(X_train2, y_train))
testscore.append(dTree.score(X_test2, y_test))
rmse.append(mean_squared_error(y_test,dTree.predict(X_test2))**0.5)
0.8158916126838516
In [29]:
#Pruned Decision Tree Regression
dTreeR = DecisionTreeRegressor(random_state = 7, max_depth=25, min_samples_leaf=5)

dTreeR.fit(X_train2, y_train)

print(dTreeR.score(X_test2, y_test))

model.append('Decision Tree Pruned')
trainscore.append(dTreeR.score(X_train2, y_train))
testscore.append(dTreeR.score(X_test2, y_test))
rmse.append(mean_squared_error(y_test,dTreeR.predict(X_test2))**0.5)
0.7958998910877095
In [30]:
#Bagging Regression
from sklearn.ensemble import BaggingRegressor

bgcl = BaggingRegressor(base_estimator=poly2_clf, n_estimators=200,random_state=7)

bgcl = bgcl.fit(X_train2, y_train)

bgcl_predict = bgcl.predict(X_test2)

print(bgcl.score(X_test2 , y_test))


model.append('Bagging')
trainscore.append(bgcl.score(X_train2, y_train))
testscore.append(bgcl.score(X_test2, y_test))
rmse.append(mean_squared_error(y_test,bgcl.predict(X_test2))**0.5)
0.6845535376203928
In [31]:
# AdaBoost Regression
from sklearn.ensemble import AdaBoostRegressor

ada = AdaBoostRegressor(base_estimator=poly2_clf,random_state=7,n_estimators= 200, learning_rate=0.1)

ada.fit(X_train2, y_train)

ada_predict = ada.predict(X_test2)

print(ada.score(X_test2 , y_test))

model.append('Ada Boosting')
trainscore.append(ada.score(X_train2, y_train))
testscore.append(ada.score(X_test2, y_test))
rmse.append(mean_squared_error(y_test,ada.predict(X_test2))**0.5)
0.7100323234324268
In [32]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor

grad = GradientBoostingRegressor(random_state=7, n_estimators=200)

grad.fit(X_train2, y_train)

grad_predict = grad.predict(X_test2)

print(grad.score(X_test2 , y_test))

model.append('Gradient Boosting')
trainscore.append(grad.score(X_train2, y_train))
testscore.append(grad.score(X_test2, y_test))
rmse.append(mean_squared_error(y_test,grad.predict(X_test2))**0.5)
0.9319198731728782
In [33]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor

rand = RandomForestRegressor(random_state=7, n_estimators=200)

rand.fit(X_train2, y_train)

rand_predict = rand.predict(X_test2)

print(rand.score(X_test2 , y_test))

model.append('Random Forest')
trainscore.append(rand.score(X_train2, y_train))
testscore.append(rand.score(X_test2, y_test))
rmse.append(mean_squared_error(y_test,rand.predict(X_test2))**0.5)
0.9087854819829541
In [34]:
# DataFrame to compare results.

results = pd.DataFrame()
results['Model'] = model
results['Training Score'] = trainscore
results['Testing Score'] = testscore
results['Root Mean Square Error'] = rmse
results = results.set_index('Model')
results
Out[34]:
Training Score Testing Score Root Mean Square Error
Model
Decision Tree 1.00 0.82 6.91
Decision Tree Pruned 0.94 0.80 7.28
Bagging 0.75 0.68 9.05
Ada Boosting 0.75 0.71 8.67
Gradient Boosting 0.98 0.93 4.20
Random Forest 0.98 0.91 4.86
In [ ]:
# The two best models are Gradient Boosting and Random Forest: high R2 scores and low RMSE.
In [35]:
# k-fold cross-validation to evaluate Gradient Boosting model performance
num_folds = 10
seed = 7

kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

scores = cross_val_score(grad, X_train2, y_train, cv=num_folds)
print("Accuracy after cross validation for Gradient Boosting: %.3f%% (%.3f%%)" % (scores.mean()*100.0, scores.std()*100.0))
Accuracy after cross validation for Gradient Boosting: 90.395% (2.218%)
In [36]:
# k-fold cross-validation to evaluate Random Forest model performance

scores1 = cross_val_score(rand, X_train2, y_train, cv=num_folds)
print("Accuracy after cross validation for Random Forest: %.3f%% (%.3f%%)" % (scores1.mean()*100.0, scores1.std()*100.0))
Accuracy after cross validation for Random Forest: 88.566% (2.643%)
In [37]:
#Get Gradient boosting params
grad.get_params()
Out[37]:
{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': 7,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}
In [38]:
#Get Random Forest params
rand.get_params()
Out[38]:
{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 7,
 'verbose': 0,
 'warm_start': False}
In [39]:
h_model= []
h_trainscore = []
h_testscore = []
h_rmse = []

# Blanks list to store model name, training score, testing score, RMSE after hyperparameter tuning
In [ ]:
# I experimented with both Randomized Search and Grid Search.
# Randomized Search gave better results and a faster run time, so I chose Randomized Search and display only its results (a grid-search sketch follows for reference).
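In [ ]:
# For reference, a minimal sketch of the grid-search variant (hypothetical
# parameter grid; only the randomized-search results below are reported).
from sklearn.model_selection import GridSearchCV
grid_params = {"max_depth": [3, 7, None],
               "min_samples_leaf": [1, 2, 5],
               "min_samples_split": [2, 5, 10]}
gridCV = GridSearchCV(rand, param_grid=grid_params, cv=5, n_jobs=-1)
# gridCV.fit(X_train2, y_train)          # slower than RandomizedSearchCV on this data
# print(gridCV.best_params_, gridCV.score(X_test2, y_test))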
In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from time import time
In [41]:
# specify parameters and distributions to sample for Random Forest
param_dist = {"max_depth": [3, 7, 25, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False]}
In [42]:
# run randomized search for Random Forest
samples = 10  # number of random samples 
randomCV = RandomizedSearchCV(rand, param_distributions=param_dist, n_iter=samples)
randomCV.fit(X_train2, y_train)
print(randomCV.best_params_)

#Storing Scores
h_model.append('RandomizedSearchCV Random Forest')
h_trainscore.append(randomCV.score(X_train2, y_train))
h_testscore.append(randomCV.score(X_test2, y_test))
h_rmse.append(mean_squared_error(y_test,randomCV.predict(X_test2))**0.5)
{'bootstrap': False, 'max_depth': None, 'max_features': 8, 'min_samples_leaf': 2, 'min_samples_split': 2}
In [43]:
parameters = {
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.25, 0.5],
    "min_samples_split": [2, 3, 5, 6, 9, 10],
    "min_samples_leaf": [1, 2, 3, 6, 9, 10],
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10, 100, 200, 500]
    }
In [44]:
rand_grad = RandomizedSearchCV(grad, param_distributions=parameters, cv=10, n_jobs=-1)
rand_grad.fit(X_train2, y_train)
print(rand_grad.score(X_train2, y_train))
print(rand_grad.best_params_)
0.9760750753284413
{'subsample': 0.618, 'n_estimators': 500, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 5, 'learning_rate': 0.025, 'criterion': 'mae'}
In [45]:
h_model.append('RandomizedSearchCV Gradient boosting')
h_trainscore.append(rand_grad.score(X_train2, y_train))
h_testscore.append(rand_grad.score(X_test2, y_test))
h_rmse.append(mean_squared_error(y_test,rand_grad.predict(X_test2))**0.5)
In [47]:
Rand_results = pd.DataFrame()
Rand_results['Model'] = h_model
Rand_results['Training Score'] = h_trainscore
Rand_results['Testing Score'] = h_testscore
Rand_results['Root Mean Square Error'] = h_rmse
Rand_results = Rand_results.set_index('Model')
Rand_results
Out[47]:
Training Score Testing Score Root Mean Square Error
Model
RandomizedSearchCV Random Forest 0.99 0.92 4.57
RandomizedSearchCV Gradient boosting 0.98 0.93 4.27
In [48]:
## RandomizedSearchCV-tuned Gradient Boosting is the best model:
# Training score of 98%
# Testing score of 93%
# RMSE of 4.27
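In [ ]:
# Optional follow-up (sketch, not part of the original run): persist the tuned
# gradient-boosting estimator so it can be reused without repeating the search.
# The filename is only an example.
import joblib
joblib.dump(rand_grad.best_estimator_, 'concrete_strength_gbr.joblib')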